import sys
import unicodedata
import numpy as np
import pandas as pd
from notebook import __version__ as nbv
# matplotlib Libraries
import matplotlib.pyplot as plt
from matplotlib import __version__ as mpv
# mlxtend Libraries
from mlxtend.plotting import plot_confusion_matrix
from mlxtend import __version__ as mlxv
# NLTK Libraries
from nltk import __version__ as nltkv
from nltk import pos_tag
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
# Sklearn Libraries
from sklearn import __version__ as skv
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import precision_score, f1_score, accuracy_score, recall_score, confusion_matrix
# Keras Libraries
from keras import __version__ as kv
from keras import backend as K
from keras.datasets import mnist
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.wrappers.scikit_learn import KerasClassifier
from keras.layers.convolutional import Conv2D, MaxPooling2D
# Library Versions
lib_info = [('nltk', nltkv), ('numpy', np.__version__), ('keras', kv), ('pandas', pd.__version__),
('sklearn', skv), ('mlxtend', mlxv), ('matplotlib', mpv), ('Jupyter Notebook (notebook)', nbv)]
print('Library Versions\n' + '='*16)
for name, vers in lib_info:
print('{:>27} = {}'.format(name, vers))
class TextNormalizer():
    def __init__(self, data, stop_words):
        self.data = data
        self.stop_words = stop_words
        self.porter = PorterStemmer()
        # Build the Unicode punctuation translation table once, instead of on every call
        self.punctuation = dict.fromkeys(i for i in range(sys.maxunicode)
                                         if unicodedata.category(chr(i)).startswith('P'))

    def lower_txt(self, data):
        return data.apply(lambda s: s.lower() if isinstance(s, str) else s)

    def remove_punct(self, data):
        return data.apply(lambda s: str(s).translate(self.punctuation))

    def remove_stopword(self, data):
        return [' '.join([word for word in s.split()
                          if word not in self.stop_words])
                for s in data]

    def word_stem(self, data):
        # Stem each word individually; passing the whole comment to the stemmer would treat it as one token
        return data.apply(lambda s: ' '.join(self.porter.stem(word) for word in str(s).split()))
Shuffle the data randomly and select the first 3,000 rows to reduce the data size and speed up the analysis (the neural net takes a while to train on large datasets).
commentData = pd.read_json('Comment_Data/categorized-comments.jsonl', lines=True)
seed = 34
commentData = commentData.sample(frac=1, random_state=seed)[:3000].reset_index(drop=True)
commentData.head()
print('The shape of the data is: {:,} rows and {:,} columns'.format(commentData.shape[0], commentData.shape[1]))
stop_words = stopwords.words('english')
txtNorm = TextNormalizer(commentData['txt'], stop_words)
commentData['lower'] = txtNorm.lower_txt(txtNorm.data)
commentData['no_punc'] = txtNorm.remove_punct(commentData['lower'])
commentData['rem_stopwords'] = txtNorm.remove_stopword(commentData['no_punc'])
commentData['word_stem'] = txtNorm.word_stem(commentData['rem_stopwords'])
x_train, x_test, y_train, y_test = train_test_split(commentData['word_stem'],
commentData['cat'],
train_size=0.70,
random_state=seed)
x_train.head()
y_train.head()
classifier = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('clf', MLPClassifier(hidden_layer_sizes=(6, 2), random_state=seed))
])
param_grid = {'clf__activation': ['identity', 'logistic', 'tanh', 'relu'],
'clf__learning_rate': ['constant', 'invscaling', 'adaptive'],
'clf__solver': ['lbfgs', 'sgd', 'adam'],
'clf__alpha': [1e-4, 1e-5, 1e-6]
}
class_grid = GridSearchCV(classifier, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
class_grid.fit(x_train, y_train)
pd.DataFrame(class_grid.cv_results_).sort_values('mean_test_score',
ascending=False)[['params', 'mean_test_score']].head(10)
round(class_grid.best_score_, 4)
print('Best Classification Parameters\n' + '='*30)
for name, val in class_grid.best_params_.items():
print('{:>16}: {}'.format(name.replace('clf__', ''), val))
y_pred = class_grid.predict(x_test)
accScore = round(accuracy_score(y_test, y_pred), 4) * 100
precScore = round(precision_score(y_test, y_pred, average='weighted', zero_division=0), 4) * 100
recallScore = round(recall_score(y_test, y_pred, average='weighted'), 4) * 100
f1_Score = round(f1_score(y_test, y_pred, average='weighted'), 4) * 100
print('Model Classification Metric Scores\n' + '='*34 +
'\n\t Accuracy: {:,.2f}%\n\tPrecision: {:,.2f}%\n\t   Recall: {:,.2f}%\n\t       F1: {:,.2f}%'
.format(accScore, precScore, recallScore, f1_Score))
plt.rcParams['figure.figsize'] = (16, 10)
plt.rcParams.update({'font.size': 16})
plot_confusion_matrix(conf_mat=confusion_matrix(y_test, y_pred),
class_names=['Science and Technology', 'Sports', 'Video Games'],
cmap=plt.cm.Blues)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
We can see that the model classifies most comments as belonging to the "Video Games" category and, interestingly enough, none to the "Science and Technology" category.
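To back up that observation, the distribution of predicted labels can be inspected directly; a minimal sketch, reusing the y_pred computed above:
# Count how many test comments the tuned MLP assigns to each category
pd.Series(y_pred).value_counts()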
Fit a neural network classifier using Keras on the comments dataset and report the accuracy, precision, recall, F1-score, and confusion matrix.
n_features = x_train.shape[0]  # number of training samples, reused below as the TF-IDF vocabulary cap and the network's input width
n_classes = len(y_train.unique())
def build_network():
nn = Sequential()
    nn.add(Dense(250, activation='relu', input_dim=n_features))
nn.add(Dense(100, activation='relu'))
nn.add(Dense(n_classes, activation='softmax'))
nn.compile(
loss='categorical_crossentropy',
optimizer='adam',
metrics=['accuracy']
)
return nn
vect = TfidfVectorizer(max_features=n_features)
keras_model = KerasClassifier(build_fn=build_network)
x_train_vect = vect.fit_transform(x_train).toarray()  # .toarray() gives an ndarray; .todense() returns np.matrix, which Keras handles poorly
x_test_vect = vect.transform(x_test).toarray()  # transform only: re-fitting the vectorizer on the test set would leak its vocabulary
param_grid = {'epochs': [10, 25, 50],
'batch_size': [10, 25, 50]}
keras_grid = GridSearchCV(keras_model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
keras_grid.fit(x_train_vect, y_train)
pd.DataFrame(keras_grid.cv_results_).sort_values('mean_test_score',
ascending=False)[['params', 'mean_test_score']].head(10)
round(keras_grid.best_score_, 4)
keras_grid.best_params_
y_pred = keras_grid.predict(x_test_vect)
accScore = round(accuracy_score(y_test, y_pred), 4) * 100
precScore = round(precision_score(y_test, y_pred, average='weighted', zero_division=0), 4) * 100
recallScore = round(recall_score(y_test, y_pred, average='weighted'), 4) * 100
f1_Score = round(f1_score(y_test, y_pred, average='weighted'), 4) * 100
print('Model Keras Metric Scores\n' + '='*25 +
      '\n\t Accuracy: {:,.2f}%\n\tPrecision: {:,.2f}%\n\t   Recall: {:,.2f}%\n\t       F1: {:,.2f}%'
      .format(accScore, precScore, recallScore, f1_Score))
plt.rcParams['figure.figsize'] = (16, 10)
plt.rcParams.update({'font.size': 16})
plot_confusion_matrix(conf_mat=confusion_matrix(y_test, y_pred),
class_names=['Science and Technology', 'Sports', 'Video Games'],
cmap=plt.cm.Blues)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
Just like the scikit-learn model, the Keras model identified most comments as belonging to the "Video Games" category, but this time a few comments were mistakenly classified as "Science and Technology".
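A per-class breakdown makes that misclassification pattern explicit; as a minimal sketch (not part of the original run), sklearn's classification_report can be applied to the Keras predictions above:
# Per-class precision, recall, and F1 for the Keras predictions
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred, zero_division=0))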
Classify MNIST images using a convolutional neural network and report the accuracy of the results.
# Use channels-last image ordering (the color channel is the final dimension)
K.set_image_data_format("channels_last")
# Set seed
np.random.seed(0)
# Set image information
channels = 1
height = 28
width = 28
# Load data and target from MNIST data
(data_train, target_train), (data_test, target_test) = mnist.load_data()
# Reshape training image data into features
data_train = data_train.reshape(data_train.shape[0], height, width, channels)
# Reshape test image data into features
data_test = data_test.reshape(data_test.shape[0], height, width, channels)
# Rescale pixel intensity to between 0 and 1
features_train = data_train / 255
features_test = data_test / 255
# One-hot encode target
target_train = np_utils.to_categorical(target_train)
target_test = np_utils.to_categorical(target_test)
number_of_classes = target_test.shape[1]
# Start neural network
network = Sequential()
# Add convolutional layer with 64 filters, a 5x5 window, and ReLU activation function
network.add(Conv2D(filters=64, kernel_size=(5, 5), kernel_initializer='normal', padding='valid',
                   input_shape=(height, width, channels), activation='relu'))
# Add max pooling layer with a 2x2 window
network.add(MaxPooling2D(pool_size=(2, 2)))
# Add dropout layer
network.add(Dropout(0.5))
# Add layer to flatten input
network.add(Flatten())
# Add fully connected layer of 128 units with a ReLU activation function
network.add(Dense(128, activation="relu"))
# Add dropout layer
network.add(Dropout(0.5))
# Add fully connected layer with a softmax activation function
network.add(Dense(number_of_classes, activation="softmax"))
# Compile neural network
network.compile(loss="categorical_crossentropy", # Cross-entropy
optimizer="rmsprop", # Root Mean Square Propagation
metrics=["accuracy"]) # Accuracy performance metric
network.summary()
# Train neural network
history = network.fit(features_train, # Features
target_train, # Target
epochs=2, # Number of epochs
verbose=2, # Print description after each epoch
batch_size=1000, # Number of observations per batch
validation_data=(features_test, target_test)) # Data for evaluation
print('Keras Image Classification Accuracies\n' + '='*37)
for index, (acc, loss) in enumerate(zip(history.history['accuracy'], history.history['loss'])):
    # Loss is cross-entropy, not a percentage, so it is printed as a plain value
    print('Epoch {}: Accuracy = {:,.2f}%, Loss = {:.4f}'.format(index + 1, acc * 100, loss))
The model ended with an accuracy score of ~94%, which is excellent considering it only ran for 2 epochs.
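The per-epoch accuracies above are training-set figures; a held-out number can be computed directly with evaluate. A minimal sketch, reusing network, features_test, and target_test from above:
# Evaluate the trained CNN on the held-out MNIST test set
test_loss, test_acc = network.evaluate(features_test, target_test, verbose=0)
print('Test accuracy = {:,.2f}%, Test loss = {:.4f}'.format(test_acc * 100, test_loss))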